Purpose: Remove Ensembl (ENSG) gene identifiers that are not in the Open Targets target list from the mutation frequency tables (SNV, CNV and fusion), the TPM summary statistics tables and the methylation summary tables, and update disease identifiers to match the Open Targets disease list.
To run this from the command line, use:
Rscript -e "rmarkdown::render('01-filter-mtp-tables-for-current-gencode.Rmd', clean = TRUE)"
This assumes you are in the module directory of the repository, OpenPedCan-analysis/analyses/filter-mtp-tables.
Load libraries
# R analysis packages
suppressWarnings(
suppressPackageStartupMessages(library(rtracklayer))
)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(jsonlite))
# Magrittr pipe
`%>%` <- dplyr::`%>%`
Set up directories.
# Directories for input and output files, all resolved from the repository
# root (the closest parent directory containing `.git`).
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")
scratch_dir <- file.path(root_dir, "scratch")
analyses_dir <- file.path(root_dir, "analyses")
module_dir <- file.path(analyses_dir, "filter-mtp-tables")
mtp_annot_results_dir <- file.path(analyses_dir, "mtp-annotations/results")
gene_match_input_dir <- file.path(analyses_dir, "gene_match/input")
# unfiltered MTP tables are read from here
input_dir <- file.path(scratch_dir, "mtp-commit")
# filtered MTP tables are written here
output_dir <- file.path(scratch_dir, "mtp-filtered")
# tables of dropped identifiers are written here
results_dir <- file.path(module_dir, "results")

# Create every directory this notebook writes to if it doesn't exist yet.
# `recursive = TRUE` also creates any missing parent directories.
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}
Get Open Targets GENCODE Ensembl identifiers
# Unique Ensembl gene IDs listed in the MTP targets mapping table
ensg_ids <- unique(
  dplyr::pull(
    readr::read_tsv(
      file.path(mtp_annot_results_dir, "mtp-targets-mapping.tsv.gz")
    ),
    gene_id
  )
)
Parsed with column specification:
cols(
gene_symbol = col_character(),
gene_id = col_character(),
transcript_id = col_character()
)
Get PMTL Ensembl identifiers
# Unique PMTL Ensembl IDs, excluding rows whose Ensembl_ID is
# "Symbol_Not_Found"
pmtl_ids <- readr::read_tsv(
  file.path(gene_match_input_dir, "PMTL_v3.1.tsv")
)
pmtl_ids <- dplyr::filter(pmtl_ids, Ensembl_ID != "Symbol_Not_Found")
pmtl_ids <- unique(dplyr::pull(pmtl_ids, Ensembl_ID))
Parsed with column specification:
cols(
Ensembl_ID = col_character(),
Approved_Symbol = col_character(),
FDA_Designation = col_character(),
FDA_Class = col_character(),
FDA_Target = col_character(),
Mapping_Description = col_character()
)
Get OpenPedCan sample IDs (Kids_First_Biospecimen_ID)
# Tumor biospecimen IDs with their cohort.
# Only the three columns used here are parsed, each explicitly as character,
# so readr does not guess types for the remaining histologies columns
# (guessing from the first 10000 rows caused parsing failures in columns
# this notebook never uses).
sample_ids <- readr::read_tsv(
  file.path(data_dir, "histologies.tsv"),
  col_types = readr::cols_only(
    Kids_First_Biospecimen_ID = readr::col_character(),
    sample_type = readr::col_character(),
    cohort = readr::col_character()
  )
) %>%
  dplyr::filter(sample_type == "Tumor") %>%
  dplyr::select(Kids_First_Biospecimen_ID, cohort)
Parsed with column specification:
cols(
.default = col_character(),
age_at_diagnosis_days = col_double(),
OS_days = col_double(),
EFS_days = col_double(),
age_last_update_days = col_double(),
CNS_region = col_logical(),
normal_fraction = col_double(),
tumor_fraction = col_double(),
tumor_ploidy = col_double(),
age_at_event_days = col_double(),
age_at_chemo_start = col_double(),
age_at_radiation_start = col_double(),
cell_line_composition = col_logical(),
cell_line_passage = col_logical(),
tumor_fraction_RFpurify_ABSOLUTE = col_logical(),
tumor_fraction_RFpurify_ESTIMATE = col_logical(),
tumor_fraction_LUMP = col_logical(),
dkfz_v11_methylation_subclass = col_logical(),
dkfz_v11_methylation_subclass_score = col_logical(),
dkfz_v12_methylation_subclass = col_logical(),
dkfz_v12_methylation_subclass_score = col_logical()
# ... with 2 more columns
)
See spec(...) for full column specifications.
Warning: 22740 parsing failures.
row col expected actual file
19856 CNS_region 1/0/T/F/TRUE/FALSE Hemispheric '/home/rstudio/OpenPedCan-analysis/data/histologies.tsv'
19856 tumor_fraction_RFpurify_ABSOLUTE 1/0/T/F/TRUE/FALSE 0.323379333333333 '/home/rstudio/OpenPedCan-analysis/data/histologies.tsv'
19856 tumor_fraction_RFpurify_ESTIMATE 1/0/T/F/TRUE/FALSE 0.687136805 '/home/rstudio/OpenPedCan-analysis/data/histologies.tsv'
19856 tumor_fraction_LUMP 1/0/T/F/TRUE/FALSE 0.554654468118469 '/home/rstudio/OpenPedCan-analysis/data/histologies.tsv'
19856 dkfz_v11_methylation_subclass 1/0/T/F/TRUE/FALSE LGG_DIG_DIA '/home/rstudio/OpenPedCan-analysis/data/histologies.tsv'
..... ................................ .................. ................. ........................................................
See problems(...) for more details.
Function to filter for required Ensembl gene identifiers and write to file
# Filter an MTP table to the required Ensembl gene identifiers, update the
# disease identifiers of selected diseases, and write the result to file.
#
# Args:
#   mtp_table: data frame with `Disease` and `MONDO` columns plus either
#     (`Gene_Ensembl_ID`, `EFO`) or
#     (`targetFromSourceId`, `diseaseFromSourceMappedId`).
#   ensg_ids: vector of Ensembl gene identifiers to keep.
#   file_name: name of the TSV file to create in `output_dir`.
#
# Returns:
#   The filtered and updated table.
#
# Side effects:
#   Writes `file_name` into `output_dir` (a variable defined outside this
#   function). For tables keyed by `targetFromSourceId` whose file name
#   contains "methyl", also writes a gzip-compressed RDS copy named after the
#   part of `file_name` before its first dot.
filter_mtp_table <- function(mtp_table, ensg_ids, file_name) {
  # Replace the disease identifier of specific diseases; every other row
  # keeps its current identifier.
  remap_disease_id <- function(disease, current_id) {
    dplyr::case_when(
      disease == "Chordoma" ~ "MONDO_0008978",
      disease == "Meningioma" ~ "MONDO_0016642",
      disease == "Perineuroma" ~ "MONDO_0019404",
      disease == "Wilms tumor" ~ "MONDO_0006058",
      TRUE ~ current_id
    )
  }

  table_cols <- colnames(mtp_table)
  if ("Gene_Ensembl_ID" %in% table_cols) {
    mtp_table <- mtp_table %>%
      dplyr::filter(Gene_Ensembl_ID %in% ensg_ids) %>%
      dplyr::mutate(EFO = remap_disease_id(Disease, EFO))
    # tables with this column layout are only written as TSV
    write_rds_copy <- FALSE
  } else if ("targetFromSourceId" %in% table_cols) {
    mtp_table <- mtp_table %>%
      dplyr::filter(targetFromSourceId %in% ensg_ids) %>%
      dplyr::mutate(
        diseaseFromSourceMappedId =
          remap_disease_id(Disease, diseaseFromSourceMappedId)
      )
    write_rds_copy <- grepl("methyl", file_name)
  } else {
    # fail early with a clear message instead of a cryptic "object not found"
    stop(
      "mtp_table must contain a 'Gene_Ensembl_ID' or ",
      "'targetFromSourceId' column",
      call. = FALSE
    )
  }

  mtp_table <- mtp_table %>%
    dplyr::mutate(
      MONDO = dplyr::case_when(
        Disease == "Perineuroma" ~ "MONDO_0019404",
        TRUE ~ MONDO
      )
    )

  readr::write_tsv(mtp_table, file.path(output_dir, file_name))
  if (write_rds_copy) {
    # keep only the part of the file name before the first dot
    rds_file <- paste(
      unlist(stringr::str_split(file_name, "\\."))[1], "rds",
      sep = "."
    )
    readr::write_rds(mtp_table, file.path(output_dir, rds_file),
                     compress = "gz")
  }
  mtp_table
}
# Load the gene-level SNV mutation frequency table as a tibble
mtp_file <- file.path(
  input_dir,
  "gene-level-snv-consensus-annotated-mut-freq.tsv.gz"
)
gene_level_snv <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
Registered S3 method overwritten by 'R.oo':
method from
throw.default R.methodsS3
# Filter the gene-level table and write it under its original file name
file_name <- basename(mtp_file)
filter_mtp_table(gene_level_snv, ensg_ids, file_name)

# Distinct sample/gene pairs from the SNV consensus MAF; rows without a gene
# are dropped and the sample column is renamed to match the histologies table
consensus_ensg <- data.table::fread(
  file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz"),
  select = c("Tumor_Sample_Barcode", "Gene"),
  showProgress = FALSE
)
consensus_ensg <- tibble::as_tibble(consensus_ensg)
consensus_ensg <- dplyr::filter(consensus_ensg, !is.na(Gene))
consensus_ensg <- dplyr::rename(
  consensus_ensg,
  Kids_First_Biospecimen_ID = Tumor_Sample_Barcode
)
consensus_ensg <- dplyr::distinct(consensus_ensg)

# Identifiers removed by the filter, annotated with the samples that carry
# them and with each sample's OpenPedCan cohort
filtered_ensg <- gene_level_snv %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(Gene_symbol, Ensembl_ID = targetFromSourceId) %>%
  dplyr::distinct() %>%
  dplyr::left_join(consensus_ensg, by = c("Ensembl_ID" = "Gene")) %>%
  dplyr::left_join(sample_ids, by = "Kids_First_Biospecimen_ID")

# Save the dropped identifiers
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "gene-level-snv-consensus-annotated-mut-freq_dropped_ensg.tsv.gz"
  )
)

# Show which cohorts each removed identifier occurs in
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  dplyr::distinct() %>%
  tidyr::drop_na() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET
100128124 0 1 0
100128787 1 1 0
100129098 1 1 0
100129307 1 1 0
100129385 0 1 0
100129528 1 1 1
100129603 1 1 1
100130301 1 1 1
100130520 1 1 0
100131107 1 1 1
100131211 1 1 0
100131378 1 1 1
100131539 1 1 1
100132074 0 1 1
100132202 1 1 0
100132247 0 1 0
100133315 1 1 1
100141515 1 1 1
100191040 0 1 1
100286986 1 1 1
100288798 1 1 0
100289255 0 1 0
100302652 1 1 1
100302692 1 1 0
100302736 0 1 1
100310812 0 1 0
100421372 1 1 1
100463487 1 1 0
100499466 1 1 1
100505385 1 1 1
100505498 1 1 0
100505502 1 1 1
100505570 0 1 0
100505583 1 1 0
100505841 1 1 1
100506144 1 1 1
100506376 1 1 1
100506851 1 1 0
100506874 1 1 0
100506974 1 1 0
100507002 1 1 0
100507246 0 1 1
100507527 1 1 1
100507607 1 1 0
100509620 1 1 0
100526664 1 1 1
100526693 0 1 1
100526694 1 1 1
100526740 1 1 1
100526760 0 1 1
100526761 0 1 1
100526767 1 1 1
100526772 0 1 1
100526773 0 1 1
100526783 1 1 1
100526794 1 1 1
100526835 0 1 1
100526842 1 1 1
100527943 0 1 1
100527949 1 1 1
100528017 0 1 1
100528021 1 1 1
100528030 0 1 1
100528032 1 1 1
100529063 0 1 1
100529097 0 1 1
100529144 0 1 1
100529240 1 1 1
100529241 0 1 1
100529257 1 1 0
100529261 0 1 1
100532726 0 1 1
100532731 0 1 1
100532736 1 1 1
100533105 0 1 1
100533106 0 1 1
100533181 0 1 1
100533952 0 1 0
100533997 1 1 1
100534592 0 1 1
100534599 0 1 1
10060 1 1 1
100631383 1 1 1
100652781 1 1 1
100861540 1 1 1
100885850 1 1 1
100996598 1 1 1
100996645 1 1 0
100996648 0 1 1
100996758 1 1 0
100996928 1 0 1
101059906 1 1 0
101059938 0 1 0
101060341 1 1 0
101060588 1 1 0
101060684 1 1 1
10130 0 1 1
10178 1 1 1
101926886 1 1 0
101926942 1 1 0
101927286 0 1 1
101927375 1 1 0
101927423 1 1 0
101927434 1 1 0
101927450 1 1 0
101927468 1 1 0
101927484 1 1 0
101927655 0 1 1
101927668 1 1 0
101927745 1 1 0
101927798 1 1 0
101928047 0 1 0
101928077 1 1 0
101928114 1 1 0
101928268 0 1 0
101928451 1 1 1
101928471 0 1 0
101928570 1 1 0
101928682 1 1 0
101928701 1 1 0
101928778 0 1 0
101928882 1 1 0
101929073 1 1 0
101929084 1 1 0
101929144 1 1 0
101929221 0 1 0
101929309 1 1 0
101929321 1 1 0
101929355 1 1 0
101929356 1 1 0
101929563 1 1 0
101929578 1 1 1
101929628 1 1 0
101929937 0 1 1
101930420 1 1 0
101930434 1 1 0
10245 0 1 1
10251 1 1 1
102546299 1 1 0
102723765 1 1 0
102723789 1 1 0
102723838 1 1 1
102723968 1 1 1
102723971 1 1 0
102723994 1 1 0
102724072 1 1 0
102724081 1 1 0
102724159 1 1 0
102724194 0 1 0
102724265 1 1 0
102724289 1 1 0
102724338 0 1 0
102724380 1 1 0
102724474 1 1 0
102724488 1 1 0
102724533 1 1 0
102724642 1 1 1
102724657 1 1 0
102724849 1 1 0
102724861 1 1 0
102724862 1 1 0
102724971 1 1 0
102725191 1 1 0
102800317 0 1 1
10284 1 1 1
10298 1 1 1
103 0 0 1
10333 0 1 1
10336 1 1 1
10349 1 1 1
103689918 0 1 0
1038 0 1 1
10402 1 1 1
10439 0 0 1
104472715 1 1 1
10529 1 1 1
105369165 1 1 0
105369250 1 1 0
105369333 1 1 0
105369501 1 1 0
105369591 1 1 0
105369617 1 1 0
105369669 1 1 0
105369715 1 1 0
105369793 1 1 0
105369809 1 1 1
105369869 0 1 1
105369917 1 1 1
105369958 1 1 0
105370025 1 1 0
105370057 0 1 0
105370084 1 1 0
105370130 0 1 0
105370175 1 1 0
105370214 1 1 0
105370236 1 1 0
105370289 0 1 0
105370299 1 1 0
105370314 1 1 0
105370343 1 1 0
105370344 1 1 0
105370361 1 1 0
105370397 1 1 0
105370470 1 1 0
105370475 1 1 0
105370476 1 1 0
105370491 1 1 1
105370507 1 1 0
105370593 1 1 0
105370656 1 1 0
105370673 1 1 0
105370698 1 1 0
105370706 1 1 0
105370739 1 1 0
105370980 1 1 1
105371063 0 1 0
105371233 1 1 0
105371235 1 1 0
105371253 1 1 0
105371278 1 1 0
105371419 1 1 1
105371695 1 1 0
105371811 1 1 0
105371948 0 1 0
105371974 1 1 0
105372019 1 1 0
105372043 1 1 0
105372157 1 1 0
105372186 1 1 0
105372204 1 1 0
105372267 1 1 0
105372358 1 1 0
105372362 1 1 0
105372371 1 1 0
105372399 1 1 0
105372528 1 1 0
105372532 1 1 0
105372562 1 1 0
105372620 1 1 0
105372652 1 1 0
105372704 0 1 0
105372893 0 1 0
105372916 1 1 0
105372923 1 1 0
105372952 1 1 0
105373057 1 1 1
105373091 1 1 0
105373163 1 1 0
105373220 1 1 0
105373261 1 1 0
105373273 1 1 0
105373297 1 1 0
105373346 1 1 0
105373444 1 1 0
105373523 1 1 0
105373553 1 1 0
105373598 1 1 0
105373652 1 1 0
105373727 1 1 0
105373782 1 1 0
105373836 1 1 0
105373893 1 1 0
105373896 1 1 0
105373911 1 1 0
105373941 1 1 0
105373989 1 1 0
105374016 1 1 0
105374033 1 1 0
105374056 1 1 0
105374060 1 1 1
105374094 0 1 0
105374103 1 1 0
105374122 1 1 0
105374138 1 1 0
105374160 1 1 0
105374166 1 1 0
105374301 1 1 0
105374391 1 1 0
105374413 1 1 0
105374483 1 1 0
105374492 1 1 0
105374494 1 1 0
105374525 1 1 0
105374528 1 1 0
105374549 1 1 0
105374594 1 1 0
105374627 1 1 0
105374654 1 1 0
105374769 1 1 0
105374831 1 1 0
105374855 1 1 1
105374942 1 1 0
105374956 1 1 0
105374958 1 1 0
105374970 0 1 0
105375021 1 1 1
105375036 1 1 0
105375107 0 1 0
105375116 1 1 0
105375237 1 1 0
105375278 1 1 0
105375282 1 1 0
105375291 1 1 0
105375299 1 1 0
105375322 1 1 0
105375324 1 1 0
105375337 1 1 0
105375630 1 1 0
105375646 1 1 0
105375670 1 1 0
105375724 1 1 1
105375742 1 1 0
105375749 1 1 0
105375795 0 0 1
105375817 1 1 0
105375851 1 1 0
105375906 1 1 0
105375925 1 1 0
105375937 1 1 0
105375993 1 1 0
105376083 1 1 0
105376084 1 1 0
105376123 1 1 0
105376126 1 1 0
105376135 0 1 0
105376140 0 1 0
105376147 1 1 1
105376166 1 1 0
105376191 1 1 0
105376194 1 1 0
105376244 1 1 0
105376335 1 1 0
105376341 1 1 0
105376367 1 1 0
105376400 1 1 0
105376454 1 1 0
105376625 0 1 0
105376636 1 1 0
105376661 1 1 0
105376714 1 1 1
105376735 1 1 0
105376875 1 1 0
105376876 1 1 0
105377253 1 1 0
105377278 1 1 0
105377296 1 1 0
105377308 1 1 1
105377310 1 1 0
105377350 1 1 0
105377436 1 1 0
105377447 0 1 0
105377513 1 1 0
105377579 0 1 0
105377622 1 1 0
105377663 1 1 0
105377684 1 1 0
105377699 1 1 0
105377705 1 1 0
105377732 1 1 0
105377742 1 1 0
105377781 1 1 1
105377793 1 1 0
105377796 1 1 0
105377805 1 1 1
105377841 1 1 0
105377913 1 1 0
105377959 1 1 0
105378013 1 1 0
105378021 1 1 0
105378071 1 1 0
105378102 1 1 0
105378148 1 1 0
105378193 1 1 0
105378239 1 1 0
105378314 1 1 0
105378407 1 1 0
105378415 1 1 0
105378464 1 1 0
105378532 1 1 1
105378612 1 1 0
105378621 1 1 0
105378644 1 1 0
105378712 1 1 0
105378771 1 1 0
105378776 1 1 0
105378789 1 1 0
105378802 1 1 0
105378849 1 1 0
105378947 1 1 0
105378979 1 1 0
105379100 1 1 1
105379198 1 1 0
105379315 1 1 0
105379337 1 1 0
105379752 1 1 0
10658 1 1 0
10665 1 1 1
10677 1 1 1
107181291 1 1 0
10723 1 1 1
10739 1 1 1
107983958 1 1 1
107983971 1 1 0
107983981 1 1 1
107983989 1 1 0
107983993 0 1 0
107984005 1 1 0
107984014 1 1 0
107984025 1 1 0
107984083 1 1 0
107984124 1 1 1
107984128 1 1 0
107984175 1 1 0
107984186 1 1 0
107984189 1 1 0
107984203 1 1 0
107984207 1 1 0
107984222 1 1 0
107984264 1 1 0
107984330 1 1 0
107984351 1 1 0
107984449 1 1 0
107984512 0 1 0
107984590 1 1 0
107984638 1 1 0
107984654 1 1 0
107984674 1 1 0
107984817 1 1 0
107984832 1 1 0
107984833 1 1 0
107984862 1 1 0
107984876 1 1 0
107984974 1 1 0
107985021 1 1 0
107985022 0 1 0
107985149 1 1 1
107985178 1 1 0
107985208 0 1 0
107985251 1 1 0
107985385 1 1 0
107985508 1 1 0
107985556 1 1 1
107985637 1 1 0
107985652 1 1 0
107985678 0 1 1
107985729 1 1 0
107985773 1 1 1
107985792 1 1 0
107985795 0 1 0
107985805 0 1 0
107985836 1 1 0
107985866 1 1 0
107985876 1 1 0
107985888 0 1 0
107985924 1 1 0
107985962 1 1 0
107985976 1 1 0
107986022 1 1 1
107986217 1 1 1
107986223 1 1 0
107986265 1 1 1
107986306 1 1 0
107986322 1 1 1
107986332 1 1 0
107986388 1 1 0
107986583 1 1 0
107986617 1 1 0
107986619 0 1 0
107986635 1 1 0
107986668 0 1 0
107986744 1 1 0
107986762 1 1 0
107986806 1 1 1
107986815 1 1 0
107986827 1 1 1
107986902 1 1 0
107986913 1 1 0
107986982 1 1 0
107987042 1 1 0
107987044 0 1 0
107987054 1 1 0
107987067 1 1 0
107987125 1 1 0
107987211 1 1 1
107987238 0 1 0
107987243 1 1 0
107987248 1 1 0
107987269 0 1 0
107987285 1 1 0
10810 1 1 0
10861 1 1 1
109729166 1 1 0
110116772 1 1 1
110117499 0 1 1
11045 1 1 1
110599563 0 1 1
110599583 0 1 1
110806280 1 1 1
110806290 1 1 0
111089941 1 1 1
11122 1 1 1
11148 0 1 1
11158 0 1 1
11170 0 1 1
112267855 1 1 0
112267879 1 1 0
112267897 1 1 0
112267904 1 1 1
112267968 1 1 0
112268052 0 1 0
112268076 1 1 0
112268092 1 1 0
112268119 1 1 0
112268145 1 1 0
112268168 1 1 0
112268186 1 1 0
112268444 1 1 0
11243 0 1 1
112577516 1 1 1
112577592 1 1 0
112694756 1 1 1
113130 1 1 1
11346 1 1 1
114659 1 1 1
114769 0 1 1
114780 1 1 1
114825 1 1 1
114827 1 1 1
114841035 0 1 1
114991 1 1 1
115072896 1 1 1
116039 0 1 1
116804918 1 1 1
118142757 1 1 1
1192 0 1 1
120146 1 1 1
120376 0 1 1
121601 1 1 1
122651 1 1 1
1234 1 1 1
1235 1 1 1
126204 1 1 1
127943 1 1 1
128853 0 1 1
132001 1 1 0
133688 0 1 0
134860 0 1 1
135138 0 1 1
136332 1 1 1
137196 1 1 0
137682 1 1 1
138241 1 1 0
138639 1 1 1
138652 1 1 1
138715 0 1 1
139604 0 1 1
140686 1 1 0
140699 1 1 1
140731 1 1 1
140862 1 1 0
141 1 1 1
1432 1 1 0
143501 1 1 1
143502 0 1 1
1438 1 1 0
144203 1 1 1
1448 1 1 1
144983 1 1 1
1456 1 1 1
146227 1 1 1
146433 1 1 1
146713 1 1 1
146923 0 1 0
147 1 1 0
147166 1 1 1
148 1 1 0
148137 0 1 1
148231 1 1 1
148345 0 1 0
1488 1 1 0
149013 1 1 1
149708 0 1 1
150084 1 1 1
150681 1 1 1
150684 1 1 1
150763 0 1 1
151194 1 1 1
151648 0 1 1
1523 0 1 1
152404 0 0 1
152742 1 1 0
152815 1 1 1
152940 1 1 0
153090 1 1 1
155061 1 1 1
155184 1 1 0
157567 1 1 1
157697 1 1 1
158046 1 1 0
158055 1 1 1
158358 1 1 0
158435 1 1 0
158931 0 1 1
159091 1 1 0
159371 1 1 0
1600 1 1 0
160365 1 1 1
160492 1 1 0
160857 1 1 0
161142 1 1 1
162966 0 1 1
162972 1 1 1
1638 1 1 0
1646 1 1 1
165679 1 1 1
165721 0 1 1
166929 1 1 1
168975 0 0 1
169355 0 0 1
169834 1 1 1
169966 1 1 1
170082 1 1 1
170961 0 1 1
1767 0 1 0
1780 1 1 1
1800 0 1 1
1936 1 1 1
1998 0 1 1
199990 1 1 1
200008 0 1 1
200172 1 1 1
2009 0 1 1
200959 1 1 1
201516 1 1 1
201625 1 1 1
2036 1 1 0
2078 1 1 1
2104 1 1 0
2187 1 1 1
219595 1 1 1
219958 0 1 1
2205 1 1 1
221301 1 1 1
221357 1 1 1
221946 1 1 0
222611 1 1 1
222967 1 1 0
2258 1 1 1
2259 0 1 1
22797 0 1 1
22829 1 1 1
22859 1 1 1
22871 1 1 1
22903 0 1 0
22950 1 1 1
22987 1 1 0
22995 1 1 1
22998 0 1 1
23026 1 1 0
23031 1 1 1
23089 1 1 1
23143 1 1 1
23199 1 1 1
23243 0 1 1
23247 1 1 1
23254 0 1 1
23263 0 1 1
23271 1 1 1
23353 0 1 1
23362 0 1 1
23426 0 0 1
23499 1 1 1
23520 1 1 1
23539 0 0 1
23543 1 1 1
23567 0 1 0
23761 0 1 1
245913 0 1 1
246175 1 1 1
2491 1 1 1
25 1 1 1
2542 0 1 1
254272 1 1 0
254773 1 1 1
2549 0 1 1
256076 1 1 1
257313 1 1 1
25769 1 1 1
25776 1 1 1
25777 1 1 1
25822 1 1 1
25941 0 1 1
25945 1 1 1
25979 1 1 1
26013 1 1 1
260293 1 1 1
26083 1 1 1
26147 0 1 0
26231 1 1 1
26272 1 1 0
26468 1 1 0
26609 1 1 1
267012 0 1 1
27077 1 1 1
27094 1 1 1
27097 1 1 1
27098 0 1 1
27141 1 1 1
27180 1 1 0
28 1 1 1
280664 0 0 1
28231 1 1 0
2825 1 1 1
282966 0 1 1
282973 1 1 1
282974 1 1 1
283463 1 1 1
283694 0 0 1
283848 1 1 1
283982 1 1 1
284361 1 1 0
284371 0 1 1
284498 1 1 1
284521 1 1 1
285346 1 1 1
285362 1 1 1
285498 1 1 1
285555 1 1 1
285834 1 1 0
2859 1 1 1
286 0 1 1
286187 0 1 1
286223 1 1 1
286749 1 1 1
287 1 1 1
2873 1 1 1
29058 1 1 1
2914 0 1 1
29850 0 1 1
29914 1 1 1
29990 1 1 1
29998 1 1 1
3012 0 1 1
3017 0 1 1
30820 0 1 1
3084 1 1 1
3164 1 1 1
3172 1 1 1
317703 1 1 0
3187 1 1 0
324 1 0 1
3320 0 1 1
3338 0 1 1
338872 1 1 1
338879 1 1 1
339501 0 1 0
339512 1 1 1
339766 1 1 1
340069 0 1 1
340094 0 1 1
340205 0 1 1
340441 1 1 1
340526 1 1 1
340595 1 1 1
341346 1 1 1
342931 1 1 1
343068 1 1 1
3441 0 1 1
344892 1 1 1
345062 0 1 1
347169 1 1 1
347475 1 1 1
348751 1 1 1
353149 0 1 1
3556 1 1 1
3581 0 1 0
359710 0 1 1
359845 1 1 1
360226 0 1 1
3609 0 1 1
3655 0 1 1
3699 0 1 1
3712 0 1 1
373856 1 1 1
3742 1 1 1
374286 0 1 0
374308 1 1 1
374462 1 1 1
374864 1 1 1
374900 0 1 1
374907 0 1 1
374986 1 1 1
375449 1 1 1
375616 0 1 0
375748 1 1 1
375757 0 1 1
3778 1 1 1
3801 0 0 1
3805 0 1 1
3809 1 1 1
3811 1 1 1
3831 0 1 1
387104 1 1 1
387263 1 1 1
387522 1 1 1
387723 1 1 0
388199 1 1 1
388276 0 1 1
388324 1 1 1
388677 1 1 1
388685 1 1 0
388761 1 1 1
388795 1 1 1
388960 0 1 1
389197 1 1 1
389199 0 1 1
389690 1 1 1
389895 1 1 0
390226 1 1 1
390748 1 1 1
390877 0 1 1
391003 1 1 1
392490 1 1 1
392617 1 1 1
3983 1 1 1
400073 1 1 1
400087 1 1 0
400499 1 1 1
400533 1 1 0
400620 1 1 0
400797 1 1 0
400818 1 1 1
400986 1 1 1
401067 1 1 1
401082 1 1 1
401115 1 1 1
401149 1 1 0
401237 1 1 1
401399 1 1 1
401497 0 1 0
401992 1 1 1
401994 1 1 1
402160 1 1 1
4033 0 1 0
403323 1 1 0
403341 1 1 1
404734 1 1 1
407977 0 1 1
408186 1 1 1
414061 0 1 1
4168 1 1 1
4174 1 1 0
433 0 1 1
439915 0 1 1
440348 1 1 1
440352 1 1 1
440353 1 1 1
440556 1 1 1
440689 1 1 1
440896 1 1 0
440993 1 1 1
441108 0 0 1
441155 1 1 1
441239 0 1 0
441251 1 1 1
441452 1 1 1
441459 1 1 1
441525 1 1 1
441631 1 1 0
441669 1 1 0
441911 1 1 1
442179 1 1 0
445347 1 1 1
445372 0 1 1
445815 1 1 1
448834 1 1 1
4583 1 1 1
463 1 1 1
4636 1 1 1
474354 1 1 1
4802 0 1 1
4883 0 0 1
493754 1 1 0
493901 0 1 1
4946 0 1 1
4948 1 1 0
4957 1 1 1
4988 0 1 1
4992 1 1 1
4995 1 1 1
503834 1 1 1
50514 1 1 1
50636 0 1 1
50650 0 1 1
5099 1 1 1
51031 1 1 1
51043 1 1 1
51088 1 1 0
51207 0 1 1
51265 1 1 0
51266 1 1 1
51351 1 1 1
5136 1 1 1
5137 1 1 1
5143 1 1 1
51435 1 1 0
5144 0 1 1
51466 1 1 0
5154 0 1 0
51557 1 1 0
51686 1 1 1
5217 0 1 1
5271 1 1 0
5308 0 1 1
5320 0 1 1
53354 1 1 1
5339 0 1 1
53405 1 1 1
53616 0 0 1
53632 1 1 1
54097 1 1 1
5430 1 1 1
54331 1 1 1
54332 0 1 0
5437 0 1 1
54536 0 0 1
54585 0 0 1
5475 1 1 1
54825 1 1 1
548644 1 1 1
54914 1 1 1
5495 1 1 1
54970 0 1 0
54998 0 1 1
55 1 1 1
55041 1 1 1
55179 0 1 1
5522 0 1 1
55267 1 1 1
5527 1 1 1
55284 0 1 1
552891 0 1 1
55297 0 1 1
553158 0 1 1
55324 1 1 1
55350 0 1 1
55384 1 1 1
55507 1 1 1
55521 0 1 1
55567 1 1 1
55672 1 1 1
55715 0 1 1
55734 0 1 1
55769 1 1 1
55793 1 1 1
55809 1 1 1
55819 1 1 1
55841 1 1 1
55959 1 1 1
55964 1 1 1
56165 1 1 1
56341 0 1 1
5646 0 1 1
56606 1 1 0
56658 0 1 1
56901 0 1 1
56907 1 1 1
56911 1 1 1
56918 1 1 1
56924 0 1 1
56938 1 1 1
56955 1 1 1
5697 1 1 1
56971 1 1 1
56977 0 1 0
57002 0 1 1
57047 1 1 1
57093 1 1 0
57153 1 1 1
57180 1 1 0
5733 1 1 1
57451 1 1 1
57480 1 1 1
57489 1 1 1
57529 1 1 1
57571 1 1 1
57583 1 1 1
57624 1 1 1
57642 1 1 1
57644 0 0 1
57674 0 1 1
57690 1 1 1
57705 1 1 1
57821 1 1 1
57835 1 1 1
5900 0 0 1
5906 1 1 1
5983 1 1 1
5988 1 1 1
5998 1 1 1
6006 1 1 1
60312 1 1 1
60437 1 1 1
60487 1 1 0
60490 1 1 0
60506 1 1 1
6092 1 1 1
6101 1 1 1
613037 0 1 0
613211 1 1 1
613227 1 1 1
6196 0 1 1
6276 0 1 1
631 1 1 1
6370 0 1 1
64067 1 1 1
64097 1 1 1
641517 1 1 1
642484 1 1 1
642515 0 1 1
642612 1 1 1
642799 0 1 0
642819 1 1 1
643311 0 1 1
643314 0 1 1
643376 1 1 1
643382 1 1 1
64342 1 1 0
643723 1 1 0
644100 1 1 1
645191 1 1 1
645202 1 1 1
645369 1 1 1
645414 1 1 1
645425 1 1 1
64582 1 1 1
646262 1 1 1
646588 1 1 0
646915 1 1 0
647264 1 1 0
64756 0 1 1
64770 1 1 1
64860 0 1 1
650226 1 1 0
65109 0 1 0
65249 0 1 0
65265 0 1 0
653061 1 1 1
653149 1 1 1
653857 1 1 0
654231 1 1 1
65982 0 0 1
65983 0 1 1
6675 0 1 1
6698 1 1 1
6733 1 1 0
6757 0 0 1
6788 1 1 1
6867 0 1 1
692312 0 1 1
7005 1 1 1
7037 1 1 1
7082 0 1 1
7089 0 1 1
7182 0 1 1
7275 1 1 1
727800 0 1 1
728194 1 1 0
728392 1 1 1
728441 1 1 1
728588 1 1 1
728763 1 1 1
728888 1 1 1
728957 1 1 1
728989 1 1 1
729218 1 1 0
729262 0 1 0
729359 1 1 1
729428 1 1 1
729767 0 1 1
729978 1 1 1
730005 0 0 1
730098 1 1 1
731275 1 1 1
7402 1 1 1
7417 0 1 1
7422 1 0 1
744 1 1 1
7466 1 1 0
7494 0 1 1
7547 0 0 1
7587 0 1 1
7594 0 1 0
7644 1 1 0
767558 0 0 1
7739 1 1 1
7757 0 1 0
7767 1 1 1
777 0 1 0
7771 0 1 1
7775 1 1 1
7798 1 1 1
79015 1 1 0
79020 1 1 1
79037 0 1 1
79081 0 1 1
79091 1 1 0
79470 1 1 1
79589 0 1 1
79676 1 1 1
79750 1 1 0
79784 0 1 0
79917 0 1 1
79921 0 1 1
79931 0 1 1
79949 0 1 1
80036 0 0 1
80095 1 1 1
80125 1 1 1
80254 0 1 1
80728 1 1 1
80741 0 1 1
80817 1 1 1
80833 1 1 1
80854 0 1 1
81328 1 1 1
81458 1 1 1
81889 0 1 1
8225 0 1 1
83259 1 1 1
83463 0 1 1
8347 1 1 1
83551 0 1 1
84253 0 0 1
84309 1 1 1
84337 0 1 1
84460 0 1 1
8453 0 1 0
84614 1 1 1
8470 1 1 1
84725 0 1 1
84750 1 1 0
84765 1 1 1
84790 0 1 1
84808 1 1 1
84851 1 1 1
84876 0 1 1
84924 1 1 1
84985 1 1 1
8525 1 1 1
85358 1 1 1
85395 0 1 1
85397 1 1 1
85439 1 1 1
85441 1 1 1
85452 1 1 1
8638 0 1 0
8668 0 1 1
8681 1 1 1
8711 0 1 1
8785 1 1 1
8801 0 1 1
8857 1 1 1
89780 1 1 0
89795 0 1 1
89796 1 1 1
8996 0 1 1
90007 1 1 1
90019 1 1 1
90050 1 1 1
90113 1 1 1
90293 1 1 1
9033 1 1 0
90362 1 1 1
9038 1 1 1
90416 1 1 1
90417 0 1 1
90523 1 1 0
90850 1 1 1
91227 0 1 0
91526 0 1 0
9154 1 1 1
91544 1 1 1
91646 1 1 1
91833 1 1 1
91869 1 1 0
92292 1 1 1
9274 0 1 1
92806 1 1 1
92949 0 1 0
93594 1 1 0
94025 1 1 0
94121 1 1 1
9425 0 1 1
9478 1 1 1
9498 0 1 1
9534 1 1 0
9612 1 1 1
9628 1 1 0
9645 1 1 1
9651 1 1 1
9686 1 1 1
9693 1 1 1
9721 1 1 1
9734 1 1 1
9738 1 1 1
9745 1 1 0
9758 0 1 0
9887 1 1 1
9906 1 1 1
9922 0 1 0
9942 1 1 0
9994 1 1 1
ENSG00000112096 1 1 1
ENSG00000182230 1 1 1
ENSG00000215271 1 1 1
ENSG00000221995 1 1 1
ENSG00000249860 1 1 1
ENSG00000256045 1 1 1
ENSG00000256222 1 1 1
ENSG00000256892 1 1 0
ENSG00000269028 1 1 0
ENSG00000270188 1 1 0
ENSG00000270394 1 1 0
# Display samples in removed ENSEMBL IDs that are in PMTL list
# (select() is namespace-qualified like every other dplyr call in this
# notebook, so it cannot be masked by another attached package)
filtered_ensg %>%
  dplyr::filter(Ensembl_ID %in% pmtl_ids) %>%
  dplyr::select(Ensembl_ID, Kids_First_Biospecimen_ID) %>%
  dplyr::distinct()
# Remove the gene-level data frames from the workspace
rm(gene_level_snv, filtered_ensg)
# Load the variant-level SNV mutation frequency table as a tibble
mtp_file <- file.path(
  input_dir,
  "variant-level-snv-consensus-annotated-mut-freq.tsv.gz"
)
variant_level_snv <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Filter the variant-level table and write it under its original file name
file_name <- basename(mtp_file)
filter_mtp_table(variant_level_snv, ensg_ids, file_name)

# Identifiers removed by the filter, annotated with the samples that carry
# them (from the SNV consensus pairs) and each sample's OpenPedCan cohort
filtered_ensg <- variant_level_snv %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(Gene_symbol, Ensembl_ID = targetFromSourceId) %>%
  dplyr::distinct() %>%
  dplyr::left_join(consensus_ensg, by = c("Ensembl_ID" = "Gene")) %>%
  dplyr::left_join(sample_ids, by = "Kids_First_Biospecimen_ID")

# Save the dropped identifiers
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "variant-level-snv-consensus-annotated-mut-freq_dropped_ensg.tsv.gz"
  )
)

# Show which cohorts each removed identifier occurs in
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  dplyr::distinct() %>%
  tidyr::drop_na() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET
100128124 0 1 0
100128787 1 1 0
100129098 1 1 0
100129307 1 1 0
100129385 0 1 0
100129528 1 1 1
100129603 1 1 1
100130301 1 1 1
100130520 1 1 0
100131107 1 1 1
100131211 1 1 0
100131378 1 1 1
100131539 1 1 1
100132074 0 1 1
100132202 1 1 0
100132247 0 1 0
100133315 1 1 1
100141515 1 1 1
100191040 0 1 1
100286986 1 1 1
100288798 1 1 0
100289255 0 1 0
100302652 1 1 1
100302692 1 1 0
100302736 0 1 1
100310812 0 1 0
100421372 1 1 1
100463487 1 1 0
100499466 1 1 1
100505385 1 1 1
100505498 1 1 0
100505502 1 1 1
100505570 0 1 0
100505583 1 1 0
100505841 1 1 1
100506144 1 1 1
100506376 1 1 1
100506851 1 1 0
100506874 1 1 0
100506974 1 1 0
100507002 1 1 0
100507246 0 1 1
100507527 1 1 1
100507607 1 1 0
100509620 1 1 0
100526664 1 1 1
100526693 0 1 1
100526694 1 1 1
100526740 1 1 1
100526760 0 1 1
100526761 0 1 1
100526767 1 1 1
100526772 0 1 1
100526773 0 1 1
100526783 1 1 1
100526794 1 1 1
100526835 0 1 1
100526842 1 1 1
100527943 0 1 1
100527949 1 1 1
100528017 0 1 1
100528021 1 1 1
100528030 0 1 1
100528032 1 1 1
100529063 0 1 1
100529097 0 1 1
100529144 0 1 1
100529240 1 1 1
100529241 0 1 1
100529257 1 1 0
100529261 0 1 1
100532726 0 1 1
100532731 0 1 1
100532736 1 1 1
100533105 0 1 1
100533106 0 1 1
100533181 0 1 1
100533952 0 1 0
100533997 1 1 1
100534592 0 1 1
100534599 0 1 1
10060 1 1 1
100631383 1 1 1
100652781 1 1 1
100861540 1 1 1
100885850 1 1 1
100996598 1 1 1
100996645 1 1 0
100996648 0 1 1
100996758 1 1 0
100996928 1 0 1
101059906 1 1 0
101059938 0 1 0
101060341 1 1 0
101060588 1 1 0
101060684 1 1 1
10130 0 1 1
10178 1 1 1
101926886 1 1 0
101926942 1 1 0
101927286 0 1 1
101927375 1 1 0
101927423 1 1 0
101927434 1 1 0
101927450 1 1 0
101927468 1 1 0
101927484 1 1 0
101927655 0 1 1
101927668 1 1 0
101927745 1 1 0
101927798 1 1 0
101928047 0 1 0
101928077 1 1 0
101928114 1 1 0
101928268 0 1 0
101928451 1 1 1
101928471 0 1 0
101928570 1 1 0
101928682 1 1 0
101928701 1 1 0
101928778 0 1 0
101928882 1 1 0
101929073 1 1 0
101929084 1 1 0
101929144 1 1 0
101929221 0 1 0
101929309 1 1 0
101929321 1 1 0
101929355 1 1 0
101929356 1 1 0
101929563 1 1 0
101929578 1 1 1
101929628 1 1 0
101929937 0 1 1
101930420 1 1 0
101930434 1 1 0
10245 0 1 1
10251 1 1 1
102546299 1 1 0
102723765 1 1 0
102723789 1 1 0
102723838 1 1 1
102723968 1 1 1
102723971 1 1 0
102723994 1 1 0
102724072 1 1 0
102724081 1 1 0
102724159 1 1 0
102724194 0 1 0
102724265 1 1 0
102724289 1 1 0
102724338 0 1 0
102724380 1 1 0
102724474 1 1 0
102724488 1 1 0
102724533 1 1 0
102724642 1 1 1
102724657 1 1 0
102724849 1 1 0
102724861 1 1 0
102724862 1 1 0
102724971 1 1 0
102725191 1 1 0
102800317 0 1 1
10284 1 1 1
10298 1 1 1
103 0 0 1
10333 0 1 1
10336 1 1 1
10349 1 1 1
103689918 0 1 0
1038 0 1 1
10402 1 1 1
10439 0 0 1
104472715 1 1 1
10529 1 1 1
105369165 1 1 0
105369250 1 1 0
105369333 1 1 0
105369501 1 1 0
105369591 1 1 0
105369617 1 1 0
105369669 1 1 0
105369715 1 1 0
105369793 1 1 0
105369809 1 1 1
105369869 0 1 1
105369917 1 1 1
105369958 1 1 0
105370025 1 1 0
105370057 0 1 0
105370084 1 1 0
105370130 0 1 0
105370175 1 1 0
105370214 1 1 0
105370236 1 1 0
105370289 0 1 0
105370299 1 1 0
105370314 1 1 0
105370343 1 1 0
105370344 1 1 0
105370361 1 1 0
105370397 1 1 0
105370470 1 1 0
105370475 1 1 0
105370476 1 1 0
105370491 1 1 1
105370507 1 1 0
105370593 1 1 0
105370656 1 1 0
105370673 1 1 0
105370698 1 1 0
105370706 1 1 0
105370739 1 1 0
105370980 1 1 1
105371063 0 1 0
105371233 1 1 0
105371235 1 1 0
105371253 1 1 0
105371278 1 1 0
105371419 1 1 1
105371695 1 1 0
105371811 1 1 0
105371948 0 1 0
105371974 1 1 0
105372019 1 1 0
105372043 1 1 0
105372157 1 1 0
105372186 1 1 0
105372204 1 1 0
105372267 1 1 0
105372358 1 1 0
105372362 1 1 0
105372371 1 1 0
105372399 1 1 0
105372528 1 1 0
105372532 1 1 0
105372562 1 1 0
105372620 1 1 0
105372652 1 1 0
105372704 0 1 0
105372893 0 1 0
105372916 1 1 0
105372923 1 1 0
105372952 1 1 0
105373057 1 1 1
105373091 1 1 0
105373163 1 1 0
105373220 1 1 0
105373261 1 1 0
105373273 1 1 0
105373297 1 1 0
105373346 1 1 0
105373444 1 1 0
105373523 1 1 0
105373553 1 1 0
105373598 1 1 0
105373652 1 1 0
105373727 1 1 0
105373782 1 1 0
105373836 1 1 0
105373893 1 1 0
105373896 1 1 0
105373911 1 1 0
105373941 1 1 0
105373989 1 1 0
105374016 1 1 0
105374033 1 1 0
105374056 1 1 0
105374060 1 1 1
105374094 0 1 0
105374103 1 1 0
105374122 1 1 0
105374138 1 1 0
105374160 1 1 0
105374166 1 1 0
105374301 1 1 0
105374391 1 1 0
105374413 1 1 0
105374483 1 1 0
105374492 1 1 0
105374494 1 1 0
105374525 1 1 0
105374528 1 1 0
105374549 1 1 0
105374594 1 1 0
105374627 1 1 0
105374654 1 1 0
105374769 1 1 0
105374831 1 1 0
105374855 1 1 1
105374942 1 1 0
105374956 1 1 0
105374958 1 1 0
105374970 0 1 0
105375021 1 1 1
105375036 1 1 0
105375107 0 1 0
105375116 1 1 0
105375237 1 1 0
105375278 1 1 0
105375282 1 1 0
105375291 1 1 0
105375299 1 1 0
105375322 1 1 0
105375324 1 1 0
105375337 1 1 0
105375630 1 1 0
105375646 1 1 0
105375670 1 1 0
105375724 1 1 1
105375742 1 1 0
105375749 1 1 0
105375795 0 0 1
105375817 1 1 0
105375851 1 1 0
105375906 1 1 0
105375925 1 1 0
105375937 1 1 0
105375993 1 1 0
105376083 1 1 0
105376084 1 1 0
105376123 1 1 0
105376126 1 1 0
105376135 0 1 0
105376140 0 1 0
105376147 1 1 1
105376166 1 1 0
105376191 1 1 0
105376194 1 1 0
105376244 1 1 0
105376335 1 1 0
105376341 1 1 0
105376367 1 1 0
105376400 1 1 0
105376454 1 1 0
105376625 0 1 0
105376636 1 1 0
105376661 1 1 0
105376714 1 1 1
105376735 1 1 0
105376875 1 1 0
105376876 1 1 0
105377253 1 1 0
105377278 1 1 0
105377296 1 1 0
105377308 1 1 1
105377310 1 1 0
105377350 1 1 0
105377436 1 1 0
105377447 0 1 0
105377513 1 1 0
105377579 0 1 0
105377622 1 1 0
105377663 1 1 0
105377684 1 1 0
105377699 1 1 0
105377705 1 1 0
105377732 1 1 0
105377742 1 1 0
105377781 1 1 1
105377793 1 1 0
105377796 1 1 0
105377805 1 1 1
105377841 1 1 0
105377913 1 1 0
105377959 1 1 0
105378013 1 1 0
105378021 1 1 0
105378071 1 1 0
105378102 1 1 0
105378148 1 1 0
105378193 1 1 0
105378239 1 1 0
105378314 1 1 0
105378407 1 1 0
105378415 1 1 0
105378464 1 1 0
105378532 1 1 1
105378612 1 1 0
105378621 1 1 0
105378644 1 1 0
105378712 1 1 0
105378771 1 1 0
105378776 1 1 0
105378789 1 1 0
105378802 1 1 0
105378849 1 1 0
105378947 1 1 0
105378979 1 1 0
105379100 1 1 1
105379198 1 1 0
105379315 1 1 0
105379337 1 1 0
105379752 1 1 0
10658 1 1 0
10665 1 1 1
10677 1 1 1
107181291 1 1 0
10723 1 1 1
10739 1 1 1
107983958 1 1 1
107983971 1 1 0
107983981 1 1 1
107983989 1 1 0
107983993 0 1 0
107984005 1 1 0
107984014 1 1 0
107984025 1 1 0
107984083 1 1 0
107984124 1 1 1
107984128 1 1 0
107984175 1 1 0
107984186 1 1 0
107984189 1 1 0
107984203 1 1 0
107984207 1 1 0
107984222 1 1 0
107984264 1 1 0
107984330 1 1 0
107984351 1 1 0
107984449 1 1 0
107984512 0 1 0
107984590 1 1 0
107984638 1 1 0
107984654 1 1 0
107984674 1 1 0
107984817 1 1 0
107984832 1 1 0
107984833 1 1 0
107984862 1 1 0
107984876 1 1 0
107984974 1 1 0
107985021 1 1 0
107985022 0 1 0
107985149 1 1 1
107985178 1 1 0
107985208 0 1 0
107985251 1 1 0
107985385 1 1 0
107985508 1 1 0
107985556 1 1 1
107985637 1 1 0
107985652 1 1 0
107985678 0 1 1
107985729 1 1 0
107985773 1 1 1
107985792 1 1 0
107985795 0 1 0
107985805 0 1 0
107985836 1 1 0
107985866 1 1 0
107985876 1 1 0
107985888 0 1 0
107985924 1 1 0
107985962 1 1 0
107985976 1 1 0
107986022 1 1 1
107986217 1 1 1
107986223 1 1 0
107986265 1 1 1
107986306 1 1 0
107986322 1 1 1
107986332 1 1 0
107986388 1 1 0
107986583 1 1 0
107986617 1 1 0
107986619 0 1 0
107986635 1 1 0
107986668 0 1 0
107986744 1 1 0
107986762 1 1 0
107986806 1 1 1
107986815 1 1 0
107986827 1 1 1
107986902 1 1 0
107986913 1 1 0
107986982 1 1 0
107987042 1 1 0
107987044 0 1 0
107987054 1 1 0
107987067 1 1 0
107987125 1 1 0
107987211 1 1 1
107987238 0 1 0
107987243 1 1 0
107987248 1 1 0
107987269 0 1 0
107987285 1 1 0
10810 1 1 0
10861 1 1 1
109729166 1 1 0
110116772 1 1 1
110117499 0 1 1
11045 1 1 1
110599563 0 1 1
110599583 0 1 1
110806280 1 1 1
110806290 1 1 0
111089941 1 1 1
11122 1 1 1
11148 0 1 1
11158 0 1 1
11170 0 1 1
112267855 1 1 0
112267879 1 1 0
112267897 1 1 0
112267904 1 1 1
112267968 1 1 0
112268052 0 1 0
112268076 1 1 0
112268092 1 1 0
112268119 1 1 0
112268145 1 1 0
112268168 1 1 0
112268186 1 1 0
112268444 1 1 0
11243 0 1 1
112577516 1 1 1
112577592 1 1 0
112694756 1 1 1
113130 1 1 1
11346 1 1 1
114659 1 1 1
114769 0 1 1
114780 1 1 1
114825 1 1 1
114827 1 1 1
114841035 0 1 1
114991 1 1 1
115072896 1 1 1
116039 0 1 1
116804918 1 1 1
118142757 1 1 1
1192 0 1 1
120146 1 1 1
120376 0 1 1
121601 1 1 1
122651 1 1 1
1234 1 1 1
1235 1 1 1
126204 1 1 1
127943 1 1 1
128853 0 1 1
132001 1 1 0
133688 0 1 0
134860 0 1 1
135138 0 1 1
136332 1 1 1
137196 1 1 0
137682 1 1 1
138241 1 1 0
138639 1 1 1
138652 1 1 1
138715 0 1 1
139604 0 1 1
140686 1 1 0
140699 1 1 1
140731 1 1 1
140862 1 1 0
141 1 1 1
1432 1 1 0
143501 1 1 1
143502 0 1 1
1438 1 1 0
144203 1 1 1
1448 1 1 1
144983 1 1 1
1456 1 1 1
146227 1 1 1
146433 1 1 1
146713 1 1 1
146923 0 1 0
147 1 1 0
147166 1 1 1
148 1 1 0
148137 0 1 1
148231 1 1 1
148345 0 1 0
1488 1 1 0
149013 1 1 1
149708 0 1 1
150084 1 1 1
150681 1 1 1
150684 1 1 1
150763 0 1 1
151194 1 1 1
151648 0 1 1
1523 0 1 1
152404 0 0 1
152742 1 1 0
152815 1 1 1
152940 1 1 0
153090 1 1 1
155061 1 1 1
155184 1 1 0
157567 1 1 1
157697 1 1 1
158046 1 1 0
158055 1 1 1
158358 1 1 0
158435 1 1 0
158931 0 1 1
159091 1 1 0
159371 1 1 0
1600 1 1 0
160365 1 1 1
160492 1 1 0
160857 1 1 0
161142 1 1 1
162966 0 1 1
162972 1 1 1
1638 1 1 0
1646 1 1 1
165679 1 1 1
165721 0 1 1
166929 1 1 1
168975 0 0 1
169355 0 0 1
169834 1 1 1
169966 1 1 1
170082 1 1 1
170961 0 1 1
1767 0 1 0
1780 1 1 1
1800 0 1 1
1936 1 1 1
1998 0 1 1
199990 1 1 1
200008 0 1 1
200172 1 1 1
2009 0 1 1
200959 1 1 1
201516 1 1 1
201625 1 1 1
2036 1 1 0
2078 1 1 1
2104 1 1 0
2187 1 1 1
219595 1 1 1
219958 0 1 1
2205 1 1 1
221301 1 1 1
221357 1 1 1
221946 1 1 0
222611 1 1 1
222967 1 1 0
2258 1 1 1
2259 0 1 1
22797 0 1 1
22829 1 1 1
22859 1 1 1
22871 1 1 1
22903 0 1 0
22950 1 1 1
22987 1 1 0
22995 1 1 1
22998 0 1 1
23026 1 1 0
23031 1 1 1
23089 1 1 1
23143 1 1 1
23199 1 1 1
23243 0 1 1
23247 1 1 1
23254 0 1 1
23263 0 1 1
23271 1 1 1
23353 0 1 1
23362 0 1 1
23426 0 0 1
23499 1 1 1
23520 1 1 1
23539 0 0 1
23543 1 1 1
23567 0 1 0
23761 0 1 1
245913 0 1 1
246175 1 1 1
2491 1 1 1
25 1 1 1
2542 0 1 1
254272 1 1 0
254773 1 1 1
2549 0 1 1
256076 1 1 1
257313 1 1 1
25769 1 1 1
25776 1 1 1
25777 1 1 1
25822 1 1 1
25941 0 1 1
25945 1 1 1
25979 1 1 1
26013 1 1 1
260293 1 1 1
26083 1 1 1
26147 0 1 0
26231 1 1 1
26272 1 1 0
26468 1 1 0
26609 1 1 1
267012 0 1 1
27077 1 1 1
27094 1 1 1
27097 1 1 1
27098 0 1 1
27141 1 1 1
27180 1 1 0
28 1 1 1
280664 0 0 1
28231 1 1 0
2825 1 1 1
282966 0 1 1
282973 1 1 1
282974 1 1 1
283463 1 1 1
283694 0 0 1
283848 1 1 1
283982 1 1 1
284361 1 1 0
284371 0 1 1
284498 1 1 1
284521 1 1 1
285346 1 1 1
285362 1 1 1
285498 1 1 1
285555 1 1 1
285834 1 1 0
2859 1 1 1
286 0 1 1
286187 0 1 1
286223 1 1 1
286749 1 1 1
287 1 1 1
2873 1 1 1
29058 1 1 1
2914 0 1 1
29850 0 1 1
29914 1 1 1
29990 1 1 1
29998 1 1 1
3012 0 1 1
3017 0 1 1
30820 0 1 1
3084 1 1 1
3164 1 1 1
3172 1 1 1
317703 1 1 0
3187 1 1 0
324 1 0 1
3320 0 1 1
3338 0 1 1
338872 1 1 1
338879 1 1 1
339501 0 1 0
339512 1 1 1
339766 1 1 1
340069 0 1 1
340094 0 1 1
340205 0 1 1
340441 1 1 1
340526 1 1 1
340595 1 1 1
341346 1 1 1
342931 1 1 1
343068 1 1 1
3441 0 1 1
344892 1 1 1
345062 0 1 1
347169 1 1 1
347475 1 1 1
348751 1 1 1
353149 0 1 1
3556 1 1 1
3581 0 1 0
359710 0 1 1
359845 1 1 1
360226 0 1 1
3609 0 1 1
3655 0 1 1
3699 0 1 1
3712 0 1 1
373856 1 1 1
3742 1 1 1
374286 0 1 0
374308 1 1 1
374462 1 1 1
374864 1 1 1
374900 0 1 1
374907 0 1 1
374986 1 1 1
375449 1 1 1
375616 0 1 0
375748 1 1 1
375757 0 1 1
3778 1 1 1
3801 0 0 1
3805 0 1 1
3809 1 1 1
3811 1 1 1
3831 0 1 1
387104 1 1 1
387263 1 1 1
387522 1 1 1
387723 1 1 0
388199 1 1 1
388276 0 1 1
388324 1 1 1
388677 1 1 1
388685 1 1 0
388761 1 1 1
388795 1 1 1
388960 0 1 1
389197 1 1 1
389199 0 1 1
389690 1 1 1
389895 1 1 0
390226 1 1 1
390748 1 1 1
390877 0 1 1
391003 1 1 1
392490 1 1 1
392617 1 1 1
3983 1 1 1
400073 1 1 1
400087 1 1 0
400499 1 1 1
400533 1 1 0
400620 1 1 0
400797 1 1 0
400818 1 1 1
400986 1 1 1
401067 1 1 1
401082 1 1 1
401115 1 1 1
401149 1 1 0
401237 1 1 1
401399 1 1 1
401497 0 1 0
401992 1 1 1
401994 1 1 1
402160 1 1 1
4033 0 1 0
403323 1 1 0
403341 1 1 1
404734 1 1 1
407977 0 1 1
408186 1 1 1
414061 0 1 1
4168 1 1 1
4174 1 1 0
433 0 1 1
439915 0 1 1
440348 1 1 1
440352 1 1 1
440353 1 1 1
440556 1 1 1
440689 1 1 1
440896 1 1 0
440993 1 1 1
441108 0 0 1
441155 1 1 1
441239 0 1 0
441251 1 1 1
441452 1 1 1
441459 1 1 1
441525 1 1 1
441631 1 1 0
441669 1 1 0
441911 1 1 1
442179 1 1 0
445347 1 1 1
445372 0 1 1
445815 1 1 1
448834 1 1 1
4583 1 1 1
463 1 1 1
4636 1 1 1
474354 1 1 1
4802 0 1 1
4883 0 0 1
493754 1 1 0
493901 0 1 1
4946 0 1 1
4948 1 1 0
4957 1 1 1
4988 0 1 1
4992 1 1 1
4995 1 1 1
503834 1 1 1
50514 1 1 1
50636 0 1 1
50650 0 1 1
5099 1 1 1
51031 1 1 1
51043 1 1 1
51088 1 1 0
51207 0 1 1
51265 1 1 0
51266 1 1 1
51351 1 1 1
5136 1 1 1
5137 1 1 1
5143 1 1 1
51435 1 1 0
5144 0 1 1
51466 1 1 0
5154 0 1 0
51557 1 1 0
51686 1 1 1
5217 0 1 1
5271 1 1 0
5308 0 1 1
5320 0 1 1
53354 1 1 1
5339 0 1 1
53405 1 1 1
53616 0 0 1
53632 1 1 1
54097 1 1 1
5430 1 1 1
54331 1 1 1
54332 0 1 0
5437 0 1 1
54536 0 0 1
54585 0 0 1
5475 1 1 1
54825 1 1 1
548644 1 1 1
54914 1 1 1
5495 1 1 1
54970 0 1 0
54998 0 1 1
55 1 1 1
55041 1 1 1
55179 0 1 1
5522 0 1 1
55267 1 1 1
5527 1 1 1
55284 0 1 1
552891 0 1 1
55297 0 1 1
553158 0 1 1
55324 1 1 1
55350 0 1 1
55384 1 1 1
55507 1 1 1
55521 0 1 1
55567 1 1 1
55672 1 1 1
55715 0 1 1
55734 0 1 1
55769 1 1 1
55793 1 1 1
55809 1 1 1
55819 1 1 1
55841 1 1 1
55959 1 1 1
55964 1 1 1
56165 1 1 1
56341 0 1 1
5646 0 1 1
56606 1 1 0
56658 0 1 1
56901 0 1 1
56907 1 1 1
56911 1 1 1
56918 1 1 1
56924 0 1 1
56938 1 1 1
56955 1 1 1
5697 1 1 1
56971 1 1 1
56977 0 1 0
57002 0 1 1
57047 1 1 1
57093 1 1 0
57153 1 1 1
57180 1 1 0
5733 1 1 1
57451 1 1 1
57480 1 1 1
57489 1 1 1
57529 1 1 1
57571 1 1 1
57583 1 1 1
57624 1 1 1
57642 1 1 1
57644 0 0 1
57674 0 1 1
57690 1 1 1
57705 1 1 1
57821 1 1 1
57835 1 1 1
5900 0 0 1
5906 1 1 1
5983 1 1 1
5988 1 1 1
5998 1 1 1
6006 1 1 1
60312 1 1 1
60437 1 1 1
60487 1 1 0
60490 1 1 0
60506 1 1 1
6092 1 1 1
6101 1 1 1
613037 0 1 0
613211 1 1 1
613227 1 1 1
6196 0 1 1
6276 0 1 1
631 1 1 1
6370 0 1 1
64067 1 1 1
64097 1 1 1
641517 1 1 1
642484 1 1 1
642515 0 1 1
642612 1 1 1
642799 0 1 0
642819 1 1 1
643311 0 1 1
643314 0 1 1
643376 1 1 1
643382 1 1 1
64342 1 1 0
643723 1 1 0
644100 1 1 1
645191 1 1 1
645202 1 1 1
645369 1 1 1
645414 1 1 1
645425 1 1 1
64582 1 1 1
646262 1 1 1
646588 1 1 0
646915 1 1 0
647264 1 1 0
64756 0 1 1
64770 1 1 1
64860 0 1 1
650226 1 1 0
65109 0 1 0
65249 0 1 0
65265 0 1 0
653061 1 1 1
653149 1 1 1
653857 1 1 0
654231 1 1 1
65982 0 0 1
65983 0 1 1
6675 0 1 1
6698 1 1 1
6733 1 1 0
6757 0 0 1
6788 1 1 1
6867 0 1 1
692312 0 1 1
7005 1 1 1
7037 1 1 1
7082 0 1 1
7089 0 1 1
7182 0 1 1
7275 1 1 1
727800 0 1 1
728194 1 1 0
728392 1 1 1
728441 1 1 1
728588 1 1 1
728763 1 1 1
728888 1 1 1
728957 1 1 1
728989 1 1 1
729218 1 1 0
729262 0 1 0
729359 1 1 1
729428 1 1 1
729767 0 1 1
729978 1 1 1
730005 0 0 1
730098 1 1 1
731275 1 1 1
7402 1 1 1
7417 0 1 1
7422 1 0 1
744 1 1 1
7466 1 1 0
7494 0 1 1
7547 0 0 1
7587 0 1 1
7594 0 1 0
7644 1 1 0
767558 0 0 1
7739 1 1 1
7757 0 1 0
7767 1 1 1
777 0 1 0
7771 0 1 1
7775 1 1 1
7798 1 1 1
79015 1 1 0
79020 1 1 1
79037 0 1 1
79081 0 1 1
79091 1 1 0
79470 1 1 1
79589 0 1 1
79676 1 1 1
79750 1 1 0
79784 0 1 0
79917 0 1 1
79921 0 1 1
79931 0 1 1
79949 0 1 1
80036 0 0 1
80095 1 1 1
80125 1 1 1
80254 0 1 1
80728 1 1 1
80741 0 1 1
80817 1 1 1
80833 1 1 1
80854 0 1 1
81328 1 1 1
81458 1 1 1
81889 0 1 1
8225 0 1 1
83259 1 1 1
83463 0 1 1
8347 1 1 1
83551 0 1 1
84253 0 0 1
84309 1 1 1
84337 0 1 1
84460 0 1 1
8453 0 1 0
84614 1 1 1
8470 1 1 1
84725 0 1 1
84750 1 1 0
84765 1 1 1
84790 0 1 1
84808 1 1 1
84851 1 1 1
84876 0 1 1
84924 1 1 1
84985 1 1 1
8525 1 1 1
85358 1 1 1
85395 0 1 1
85397 1 1 1
85439 1 1 1
85441 1 1 1
85452 1 1 1
8638 0 1 0
8668 0 1 1
8681 1 1 1
8711 0 1 1
8785 1 1 1
8801 0 1 1
8857 1 1 1
89780 1 1 0
89795 0 1 1
89796 1 1 1
8996 0 1 1
90007 1 1 1
90019 1 1 1
90050 1 1 1
90113 1 1 1
90293 1 1 1
9033 1 1 0
90362 1 1 1
9038 1 1 1
90416 1 1 1
90417 0 1 1
90523 1 1 0
90850 1 1 1
91227 0 1 0
91526 0 1 0
9154 1 1 1
91544 1 1 1
91646 1 1 1
91833 1 1 1
91869 1 1 0
92292 1 1 1
9274 0 1 1
92806 1 1 1
92949 0 1 0
93594 1 1 0
94025 1 1 0
94121 1 1 1
9425 0 1 1
9478 1 1 1
9498 0 1 1
9534 1 1 0
9612 1 1 1
9628 1 1 0
9645 1 1 1
9651 1 1 1
9686 1 1 1
9693 1 1 1
9721 1 1 1
9734 1 1 1
9738 1 1 1
9745 1 1 0
9758 0 1 0
9887 1 1 1
9906 1 1 1
9922 0 1 0
9942 1 1 0
9994 1 1 1
ENSG00000112096 1 1 1
ENSG00000182230 1 1 1
ENSG00000215271 1 1 1
ENSG00000221995 1 1 1
ENSG00000249860 1 1 1
ENSG00000256045 1 1 1
ENSG00000256222 1 1 1
ENSG00000256892 1 1 0
ENSG00000269028 1 1 0
ENSG00000270188 1 1 0
ENSG00000270394 1 1 0
# Display samples in removed ENSEMBL IDs that are in PMTL list.
# `select` is namespace-qualified like every other dplyr verb in this
# notebook, so a `select` exported by another attached package can never be
# picked up by mistake.
filtered_ensg %>%
  dplyr::filter(Ensembl_ID %in% pmtl_ids) %>%
  dplyr::select(Ensembl_ID, Kids_First_Biospecimen_ID) %>%
  dplyr::distinct()
# Remove data frames that are no longer needed
rm(variant_level_snv, filtered_ensg, consensus_ensg)
# Gene-level CNV frequencies: locate and load the MTP table
file_name <- "gene-level-cnv-consensus-annotated-mut-freq.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
gene_level_cnv <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(gene_level_cnv, ensg_ids, file_name)
# Unique gene symbol / Ensembl ID pairs that are absent from ensg_ids
filtered_ensg <- gene_level_cnv %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(Gene_symbol, Ensembl_ID = targetFromSourceId) %>%
  dplyr::distinct()
# Biospecimen / Ensembl ID pairs from the CNV consensus file, keeping only
# rows that carry an Ensembl identifier
consensus_ensg <- tibble::as_tibble(
  data.table::fread(
    file.path(data_dir, "consensus_wgs_plus_cnvkit_wxs.tsv.gz"),
    select = c("biospecimen_id", "ensembl"),
    showProgress = FALSE
  )
) %>%
  dplyr::filter(!is.na(ensembl)) %>%
  dplyr::rename(Kids_First_Biospecimen_ID = biospecimen_id) %>%
  dplyr::distinct()
# Attach sample IDs to the removed Ensembl IDs, then the cohort annotation
# of each sample from sample_ids
filtered_ensg <- filtered_ensg %>%
  dplyr::left_join(consensus_ensg, by = c("Ensembl_ID" = "ensembl")) %>%
  dplyr::left_join(sample_ids, by = "Kids_First_Biospecimen_ID")
# Save the removed identifiers with their samples
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "gene-level-cnv-consensus-annotated-mut-freq_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET
ENSG00000112096 1 1 1
ENSG00000161149 1 1 1
ENSG00000182230 1 1 1
ENSG00000215271 1 1 1
ENSG00000221995 1 1 1
ENSG00000228139 1 1 1
ENSG00000232196 1 1 1
ENSG00000233776 1 1 1
ENSG00000249860 1 1 1
ENSG00000255823 1 1 1
ENSG00000256045 1 1 1
ENSG00000256222 1 1 1
ENSG00000256618 1 1 1
ENSG00000256892 1 1 1
ENSG00000258808 1 1 1
ENSG00000258861 1 1 1
ENSG00000261737 1 1 1
ENSG00000269900 1 1 1
ENSG00000270672 1 1 1
ENSG00000273888 1 1 1
ENSG00000283486 1 1 1
# Display samples in removed ENSEMBL IDs that are in PMTL list.
# `select` is namespace-qualified like every other dplyr verb in this
# notebook, so a `select` exported by another attached package can never be
# picked up by mistake.
filtered_ensg %>%
  dplyr::filter(Ensembl_ID %in% pmtl_ids) %>%
  dplyr::select(Ensembl_ID, Kids_First_Biospecimen_ID) %>%
  dplyr::distinct()
# Remove data frames that are no longer needed
rm(gene_level_cnv, filtered_ensg, consensus_ensg)
# Fusion frequencies: locate and load the MTP table
file_name <- "putative-oncogene-fusion-freq.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
fusion <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(fusion, ensg_ids, file_name)
# Unique fusion / gene symbol / Ensembl ID rows that are absent from ensg_ids
filtered_ensg <- fusion %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(FusionName, Gene_symbol, Ensembl_ID = targetFromSourceId) %>%
  dplyr::distinct()
# Sample / gene symbol pairs: each fusion name is split on "--" and then on
# "/" so that every constituent gene symbol gets its own row
fusion_names <- tibble::as_tibble(
  data.table::fread(
    file.path(data_dir, "fusion-putative-oncogenic.tsv"),
    select = c("Sample", "FusionName"),
    showProgress = FALSE
  )
) %>%
  tidyr::separate_rows(FusionName, sep = "--", convert = FALSE) %>%
  tidyr::separate_rows(FusionName, sep = '/', convert = FALSE) %>%
  dplyr::rename(
    Kids_First_Biospecimen_ID = Sample,
    Gene_symbol = FusionName
  ) %>%
  dplyr::filter(!is.na(Gene_symbol)) %>%
  dplyr::distinct()
# Attach sample IDs by gene symbol, drop the fusion name, de-duplicate, and
# then attach the cohort annotation of each sample from sample_ids
filtered_ensg <- filtered_ensg %>%
  dplyr::left_join(fusion_names, by = "Gene_symbol") %>%
  dplyr::select(-FusionName) %>%
  dplyr::distinct() %>%
  dplyr::left_join(sample_ids, by = "Kids_First_Biospecimen_ID")
# Save the removed identifiers with their samples
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "putative-oncogene-fusion-freq_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET
0 1 1
ENSG00000112096 0 1 1
ENSG00000137808 0 0 1
ENSG00000215271 0 0 1
ENSG00000224739 0 0 1
ENSG00000226380 0 1 1
ENSG00000251044 0 1 0
ENSG00000255633 0 0 1
ENSG00000261737 0 1 0
ENSG00000269028 0 0 1
ENSG00000269900 0 1 1
ENSG00000273301 0 1 0
ENSG00000273730 0 1 1
ENSG00000274202 0 1 0
ENSG00000275405 0 0 1
ENSG00000275757 0 1 1
ENSG00000275987 0 0 1
ENSG00000276345 0 0 1
ENSG00000276612 0 0 1
ENSG00000276711 0 1 0
ENSG00000276932 1 1 1
ENSG00000277374 0 0 1
ENSG00000277428 1 1 1
ENSG00000277890 1 1 1
ENSG00000277927 1 1 1
ENSG00000278294 0 1 1
ENSG00000278457 0 1 0
ENSG00000278625 1 1 1
ENSG00000278793 1 1 1
ENSG00000279765 0 0 1
ENSG00000282965 0 1 0
ENSG00000284299 0 1 0
ENSG00000285106 1 1 1
ENSG00000285762 1 1 0
ENSG00000288884 0 0 1
ENSG00000289638 0 0 1
# Display samples in removed ENSEMBL IDs that are in PMTL list.
# `select` is namespace-qualified like every other dplyr verb in this
# notebook, so a `select` exported by another attached package can never be
# picked up by mistake.
filtered_ensg %>%
  dplyr::filter(Ensembl_ID %in% pmtl_ids) %>%
  dplyr::select(Ensembl_ID, Kids_First_Biospecimen_ID) %>%
  dplyr::distinct()
# Remove the frequencies data frame and the dropped-ID table
rm(fusion, filtered_ensg)
# Fused-gene frequencies: locate and load the MTP table
file_name <- "putative-oncogene-fused-gene-freq.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
fused_gene <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(fused_gene, ensg_ids, file_name)
# Unique gene symbol / Ensembl ID pairs that are absent from ensg_ids
filtered_ensg <- fused_gene %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(Gene_symbol, Ensembl_ID = targetFromSourceId) %>%
  dplyr::distinct()
# Attach sample IDs by gene symbol (fusion_names), de-duplicate, and then
# attach the cohort annotation of each sample from sample_ids
filtered_ensg <- filtered_ensg %>%
  dplyr::left_join(fusion_names, by = "Gene_symbol") %>%
  dplyr::distinct() %>%
  dplyr::left_join(sample_ids, by = "Kids_First_Biospecimen_ID")
# Save the removed identifiers with their samples
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "putative-oncogene-fused-gene-freq_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET
0 1 1
ENSG00000112096 0 1 1
ENSG00000137808 0 0 1
ENSG00000215271 0 0 1
ENSG00000224739 0 0 1
ENSG00000226380 0 1 1
ENSG00000251044 0 1 0
ENSG00000255633 0 0 1
ENSG00000261737 0 1 0
ENSG00000269028 0 0 1
ENSG00000269900 0 1 1
ENSG00000273301 0 1 0
ENSG00000273730 0 1 1
ENSG00000274202 0 1 0
ENSG00000275405 0 0 1
ENSG00000275757 0 1 1
ENSG00000275987 0 0 1
ENSG00000276345 0 0 1
ENSG00000276612 0 0 1
ENSG00000276711 0 1 0
ENSG00000276932 1 1 1
ENSG00000277374 0 0 1
ENSG00000277428 1 1 1
ENSG00000277890 1 1 1
ENSG00000277927 1 1 1
ENSG00000278294 0 1 1
ENSG00000278457 0 1 0
ENSG00000278625 1 1 1
ENSG00000278793 1 1 1
ENSG00000279765 0 0 1
ENSG00000282965 0 1 0
ENSG00000284299 0 1 0
ENSG00000285106 1 1 1
ENSG00000285762 1 1 0
ENSG00000288884 0 0 1
ENSG00000289638 0 0 1
# Display samples in removed ENSEMBL IDs that are in PMTL list.
# `select` is namespace-qualified like every other dplyr verb in this
# notebook, so a `select` exported by another attached package can never be
# picked up by mistake.
filtered_ensg %>%
  dplyr::filter(Ensembl_ID %in% pmtl_ids) %>%
  dplyr::select(Ensembl_ID, Kids_First_Biospecimen_ID) %>%
  dplyr::distinct()
# Remove the frequencies data frame, the dropped-ID table and the
# sample / gene symbol lookup
rm(fused_gene, filtered_ensg, fusion_names)
The OpenPedCan collapsed gene expression TPM matrix contains only gene symbols, not Ensembl gene identifiers. Therefore, the samples associated with the removed Ensembl identifiers cannot be listed.
# Group-wise TPM summary statistics: locate and load the MTP table
file_name <- "long_n_tpm_mean_sd_quantile_group_wise_zscore.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
tpm_group_wise_stats <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(tpm_group_wise_stats, ensg_ids, file_name)
# Unique gene symbol / Ensembl ID / cohort rows that are absent from ensg_ids
filtered_ensg <- tpm_group_wise_stats %>%
  dplyr::filter(!(Gene_Ensembl_ID %in% ensg_ids)) %>%
  dplyr::select(Gene_symbol, Ensembl_ID = Gene_Ensembl_ID, cohort) %>%
  dplyr::distinct()
# Save the removed identifiers
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "long_n_tpm_mean_sd_quantile_group_wise_zscore_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts, leaving out the
# "All Cohorts" rows
filtered_ensg %>%
  dplyr::filter(cohort != "All Cohorts") %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET TCGA
ENSG00000112096 1 1 1 1
ENSG00000204092 1 1 1 1
ENSG00000212951 1 1 1 1
ENSG00000215271 1 1 1 1
ENSG00000221995 1 1 1 1
ENSG00000224739 1 1 1 1
ENSG00000226377 1 1 1 1
ENSG00000226380 1 1 1 1
ENSG00000227021 1 1 1 1
ENSG00000227220 1 1 1 1
ENSG00000227902 1 1 1 1
ENSG00000228139 1 1 1 1
ENSG00000228206 1 1 1 1
ENSG00000228906 1 1 1 1
ENSG00000229352 1 1 1 1
ENSG00000232196 1 1 1 1
ENSG00000233776 1 1 1 1
ENSG00000236166 1 1 1 1
ENSG00000236673 1 1 1 1
ENSG00000236886 1 1 1 1
ENSG00000239467 1 1 1 1
ENSG00000239665 1 1 1 1
ENSG00000249860 1 1 1 1
ENSG00000251044 1 1 1 1
ENSG00000253878 1 1 1 1
ENSG00000254561 1 1 1 1
ENSG00000254740 1 1 1 1
ENSG00000255633 1 1 1 1
ENSG00000255823 1 1 1 1
ENSG00000256045 1 1 1 1
ENSG00000256222 1 1 1 1
ENSG00000256427 1 1 1 1
ENSG00000256618 1 1 1 1
ENSG00000256892 1 1 1 1
ENSG00000258414 1 1 1 1
ENSG00000258808 1 1 1 1
ENSG00000258861 1 1 1 1
ENSG00000259834 1 1 1 1
ENSG00000259855 1 1 1 1
ENSG00000260461 1 1 1 1
ENSG00000261068 1 1 1 1
ENSG00000261490 1 1 1 1
ENSG00000261773 1 1 1 1
ENSG00000269028 1 1 1 1
ENSG00000270188 1 1 1 1
ENSG00000270394 1 1 1 1
ENSG00000270672 1 1 1 1
ENSG00000271043 1 1 1 1
ENSG00000271409 1 1 1 1
ENSG00000271734 1 1 1 1
ENSG00000271870 1 1 1 1
ENSG00000272551 1 1 1 1
ENSG00000272567 1 1 1 1
ENSG00000272880 1 1 1 1
ENSG00000273301 1 1 1 1
ENSG00000273614 1 1 1 1
ENSG00000273837 1 1 1 1
ENSG00000273888 1 1 1 1
ENSG00000273923 1 1 1 1
ENSG00000277050 1 1 1 1
ENSG00000277352 1 1 1 1
ENSG00000278927 1 1 1 1
ENSG00000278955 1 1 1 1
ENSG00000279226 1 1 1 1
ENSG00000279769 1 1 1 1
ENSG00000279948 1 1 1 1
ENSG00000280058 1 1 1 1
ENSG00000280095 1 1 1 1
ENSG00000280346 1 1 1 1
ENSG00000280374 1 1 1 1
ENSG00000282965 1 1 1 1
# Drop the group-wise summary statistics table and the dropped-ID table
rm(list = c("tpm_group_wise_stats", "filtered_ensg"))
The OpenPedCan collapsed gene expression TPM matrix contains only gene symbols, not Ensembl gene identifiers. Therefore, the samples associated with the removed Ensembl identifiers cannot be listed.
# Gene-wise TPM summary statistics: locate and load the MTP table
file_name <- "long_n_tpm_mean_sd_quantile_gene_wise_zscore.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
tpm_gene_wise_stats <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(tpm_gene_wise_stats, ensg_ids, file_name)
# Unique gene symbol / Ensembl ID / cohort rows that are absent from ensg_ids
filtered_ensg <- tpm_gene_wise_stats %>%
  dplyr::filter(!(Gene_Ensembl_ID %in% ensg_ids)) %>%
  dplyr::select(Gene_symbol, Ensembl_ID = Gene_Ensembl_ID, cohort) %>%
  dplyr::distinct()
# Save the removed identifiers
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "long_n_tpm_mean_sd_quantile_gene_wise_zscore_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts, leaving out the
# "All Cohorts" rows
filtered_ensg %>%
  dplyr::filter(cohort != "All Cohorts") %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID GMKF PBTA TARGET TCGA
ENSG00000112096 1 1 1 1
ENSG00000204092 1 1 1 1
ENSG00000212951 1 1 1 1
ENSG00000215271 1 1 1 1
ENSG00000221995 1 1 1 1
ENSG00000224739 1 1 1 1
ENSG00000226377 1 1 1 1
ENSG00000226380 1 1 1 1
ENSG00000227021 1 1 1 1
ENSG00000227220 1 1 1 1
ENSG00000227902 1 1 1 1
ENSG00000228139 1 1 1 1
ENSG00000228206 1 1 1 1
ENSG00000228906 1 1 1 1
ENSG00000229352 1 1 1 1
ENSG00000232196 1 1 1 1
ENSG00000233776 1 1 1 1
ENSG00000236166 1 1 1 1
ENSG00000236673 1 1 1 1
ENSG00000236886 1 1 1 1
ENSG00000239467 1 1 1 1
ENSG00000239665 1 1 1 1
ENSG00000249860 1 1 1 1
ENSG00000251044 1 1 1 1
ENSG00000253878 1 1 1 1
ENSG00000254561 1 1 1 1
ENSG00000254740 1 1 1 1
ENSG00000255633 1 1 1 1
ENSG00000255823 1 1 1 1
ENSG00000256045 1 1 1 1
ENSG00000256222 1 1 1 1
ENSG00000256427 1 1 1 1
ENSG00000256618 1 1 1 1
ENSG00000256892 1 1 1 1
ENSG00000258414 1 1 1 1
ENSG00000258808 1 1 1 1
ENSG00000258861 1 1 1 1
ENSG00000259834 1 1 1 1
ENSG00000259855 1 1 1 1
ENSG00000260461 1 1 1 1
ENSG00000261068 1 1 1 1
ENSG00000261490 1 1 1 1
ENSG00000261773 1 1 1 1
ENSG00000269028 1 1 1 1
ENSG00000270188 1 1 1 1
ENSG00000270394 1 1 1 1
ENSG00000270672 1 1 1 1
ENSG00000271043 1 1 1 1
ENSG00000271409 1 1 1 1
ENSG00000271734 1 1 1 1
ENSG00000271870 1 1 1 1
ENSG00000272551 1 1 1 1
ENSG00000272567 1 1 1 1
ENSG00000272880 1 1 1 1
ENSG00000273301 1 1 1 1
ENSG00000273614 1 1 1 1
ENSG00000273837 1 1 1 1
ENSG00000273888 1 1 1 1
ENSG00000273923 1 1 1 1
ENSG00000277050 1 1 1 1
ENSG00000277352 1 1 1 1
ENSG00000278927 1 1 1 1
ENSG00000278955 1 1 1 1
ENSG00000279226 1 1 1 1
ENSG00000279769 1 1 1 1
ENSG00000279948 1 1 1 1
ENSG00000280058 1 1 1 1
ENSG00000280095 1 1 1 1
ENSG00000280346 1 1 1 1
ENSG00000280374 1 1 1 1
ENSG00000282965 1 1 1 1
# Drop the gene-wise summary statistics table and the dropped-ID table
rm(list = c("tpm_gene_wise_stats", "filtered_ensg"))
The OpenPedCan methylation and RNA-seq summaries are computed per cohort and cancer group and cannot be associated with individual samples. Therefore, the samples associated with the removed Ensembl identifiers cannot be listed.
# Gene-level methylation summary: locate and load the MTP table
file_name <- "gene-methyl-beta-values-summary.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
gene_level_methyl <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(gene_level_methyl, ensg_ids, file_name)
# Unique gene symbol / Ensembl ID / cohort rows that are absent from
# ensg_ids; the Dataset column is reported as the cohort
filtered_ensg <- gene_level_methyl %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(
    Gene_symbol,
    Ensembl_ID = targetFromSourceId,
    cohort = Dataset
  ) %>%
  dplyr::distinct()
# Save the removed identifiers
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "gene-methyl-beta-values-summary_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID PBTA TARGET
1 1
ENSG00000112096 1 1
ENSG00000204092 1 0
ENSG00000212951 1 0
ENSG00000215271 1 1
ENSG00000221995 1 1
ENSG00000224739 1 1
ENSG00000226377 1 1
ENSG00000226380 1 1
ENSG00000227021 1 1
ENSG00000227220 1 1
ENSG00000229352 1 1
ENSG00000232196 1 1
ENSG00000236166 1 1
ENSG00000236673 1 1
ENSG00000236886 1 1
ENSG00000239467 1 1
ENSG00000239665 1 1
ENSG00000249860 1 0
ENSG00000253878 1 1
ENSG00000254561 1 1
ENSG00000254740 1 1
ENSG00000255823 1 1
ENSG00000256222 1 1
ENSG00000256427 1 1
ENSG00000256618 1 1
ENSG00000256892 1 0
ENSG00000258414 1 0
ENSG00000258808 1 1
ENSG00000258861 1 1
ENSG00000261068 1 0
ENSG00000269900 1 1
ENSG00000270394 1 1
ENSG00000270672 1 1
ENSG00000273888 1 1
ENSG00000279948 1 0
ENSG00000282965 1 1
# Drop the gene-level methylation summary table and the dropped-ID table
rm(list = c("gene_level_methyl", "filtered_ensg"))
The OpenPedCan methylation and RNA-seq summaries are computed per cohort and cancer group and cannot be associated with individual samples. Therefore, the samples associated with the removed Ensembl identifiers cannot be listed.
# Isoform-level methylation summary: locate and load the MTP table
file_name <- "isoform-methyl-beta-values-summary.tsv.gz"
mtp_file <- file.path(input_dir, file_name)
isoform_level_methyl <- tibble::as_tibble(
  data.table::fread(mtp_file, sep = "\t", showProgress = FALSE)
)
# Run the MTP table filter on the loaded table
filter_mtp_table(isoform_level_methyl, ensg_ids, file_name)
# Unique gene symbol / Ensembl ID / cohort rows that are absent from
# ensg_ids; the Dataset column is reported as the cohort
filtered_ensg <- isoform_level_methyl %>%
  dplyr::filter(!(targetFromSourceId %in% ensg_ids)) %>%
  dplyr::select(
    Gene_symbol,
    Ensembl_ID = targetFromSourceId,
    cohort = Dataset
  ) %>%
  dplyr::distinct()
# Save the removed identifiers
readr::write_tsv(
  filtered_ensg,
  file.path(
    results_dir,
    "isoform-methyl-beta-values-summary_dropped_ensg.tsv.gz"
  )
)
# Cross-tabulate removed Ensembl IDs against cohorts
filtered_ensg %>%
  dplyr::select(Ensembl_ID, cohort) %>%
  tidyr::drop_na() %>%
  dplyr::distinct() %>%
  table()
cohort
Ensembl_ID PBTA TARGET
1 1
ENSG00000112096 1 1
ENSG00000137808 1 1
ENSG00000161149 1 1
ENSG00000182230 1 1
ENSG00000204092 1 0
ENSG00000212951 1 0
ENSG00000215271 1 1
ENSG00000221995 1 1
ENSG00000224739 1 1
ENSG00000226377 1 1
ENSG00000226380 1 1
ENSG00000227021 1 1
ENSG00000227220 1 1
ENSG00000229352 1 1
ENSG00000232196 1 1
ENSG00000236166 1 1
ENSG00000236673 1 1
ENSG00000236886 1 1
ENSG00000239446 1 1
ENSG00000239467 1 1
ENSG00000239665 1 1
ENSG00000249860 1 0
ENSG00000253878 1 1
ENSG00000254561 1 1
ENSG00000254740 1 1
ENSG00000255823 1 1
ENSG00000256222 1 1
ENSG00000256427 1 1
ENSG00000256618 1 1
ENSG00000256892 1 0
ENSG00000258414 1 0
ENSG00000258808 1 1
ENSG00000258861 1 1
ENSG00000261068 1 0
ENSG00000261737 1 1
ENSG00000269900 1 1
ENSG00000270394 1 1
ENSG00000270672 1 1
ENSG00000272904 1 1
ENSG00000273888 1 1
ENSG00000279765 1 1
ENSG00000279948 1 0
ENSG00000282246 1 1
ENSG00000282965 1 1
ENSG00000283486 1 1
ENSG00000284299 1 1
ENSG00000285106 1 1
ENSG00000285476 1 1
ENSG00000285762 1 1
ENSG00000286601 1 1
ENSG00000286996 1 1
ENSG00000287116 1 1
ENSG00000288541 1 0
ENSG00000288686 1 1
ENSG00000288847 1 1
ENSG00000288870 1 1
ENSG00000288943 1 1
ENSG00000288981 1 1
ENSG00000289638 1 1
# Drop the isoform-level methylation summary table and the dropped-ID table
rm(list = c("isoform_level_methyl", "filtered_ensg"))
# Record the R version, platform and package versions used for this run
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] jsonlite_1.6 data.table_1.12.2 forcats_0.4.0
[4] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
[7] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3
[10] ggplot2_3.2.0 tidyverse_1.2.1 rtracklayer_1.46.0
[13] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1 IRanges_2.20.2
[16] S4Vectors_0.24.4 BiocGenerics_0.32.0
loaded via a namespace (and not attached):
[1] Biobase_2.46.0 httr_1.4.0
[3] R.utils_2.9.0 modelr_0.1.4
[5] assertthat_0.2.1 GenomeInfoDbData_1.2.2
[7] cellranger_1.1.0 Rsamtools_2.2.3
[9] yaml_2.2.0 pillar_1.4.2
[11] backports_1.1.4 lattice_0.20-38
[13] glue_1.3.1 digest_0.6.20
[15] XVector_0.26.0 rvest_0.3.4
[17] colorspace_1.4-1 htmltools_0.3.6
[19] Matrix_1.2-17 R.oo_1.22.0
[21] XML_3.98-1.20 pkgconfig_2.0.2
[23] broom_0.5.2 haven_2.1.1
[25] zlibbioc_1.32.0 scales_1.0.0
[27] BiocParallel_1.20.1 generics_0.0.2
[29] withr_2.1.2 SummarizedExperiment_1.16.1
[31] lazyeval_0.2.2 cli_1.1.0
[33] magrittr_1.5 crayon_1.3.4
[35] readxl_1.3.1 evaluate_0.14
[37] R.methodsS3_1.7.1 nlme_3.1-140
[39] xml2_1.2.0 tools_3.6.0
[41] hms_0.4.2 matrixStats_0.54.0
[43] munsell_0.5.0 DelayedArray_0.12.3
[45] Biostrings_2.54.0 compiler_3.6.0
[47] rlang_0.4.0 grid_3.6.0
[49] RCurl_1.95-4.12 rstudioapi_0.10
[51] bitops_1.0-6 base64enc_0.1-3
[53] rmarkdown_1.13 gtable_0.3.0
[55] R6_2.4.0 GenomicAlignments_1.22.1
[57] lubridate_1.7.4 knitr_1.23
[59] rprojroot_1.3-2 stringi_1.4.3
[61] Rcpp_1.0.1 tidyselect_0.2.5
[63] xfun_0.8